import numpy as np
from lets_plot import *
import pandas as pd
import vega_datasets
import seaborn as sns
LetsPlot.setup_html()Faceting
data = sns.load_dataset("penguins")
data| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
| 340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
| 341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
| 342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
| 343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
ggplot() + geom_point(data=data, mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'))ggplot() + geom_point(
data=data,
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
)
) + facet_grid(x='species')ggplot() + geom_point(
data=data,
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
color='species'
)
) + facet_grid(x='species')ggplot()+ geom_point(
data=data[['bill_depth_mm', 'flipper_length_mm']],
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
), color='lightgrey'
) + geom_point(
data=data,
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
color='species'
)
) + facet_grid(x='species')ggplot()+ geom_point(
data=data[['bill_depth_mm', 'flipper_length_mm']],
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
)
) + geom_point(
data=data,
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
color='species'
)
) + facet_grid(x='species', y='sex')Visualizing many variables
ggplot() + geom_point(data=data, mapping=aes(x='bill_depth_mm', y='flipper_length_mm', size='bill_length_mm', color='body_mass_g'))ggplot() + geom_point(
data=data,
mapping=aes(
x='bill_depth_mm',
y='flipper_length_mm',
color='bill_length_mm',
size='body_mass_g',
)
) + facet_grid(x='species', y='sex')melted = data.reset_index().melt(id_vars=["index", "species", "island", "sex"])
merged = melted.merge(melted, on=["index", "species", "island", "sex"])
ggplot() + geom_point(
data=merged,
mapping=aes(
x='value_x',
y='value_y',
color='species'
)
) + facet_grid(x='variable_x', y='variable_y', scales='free')import altair as alt
# Standardize each numeric measurement (subtract the column mean, divide by
# the column standard deviation) so the four columns share one value scale.
numeric_fields = ('bill_depth_mm', 'flipper_length_mm', 'bill_length_mm', 'body_mass_g')
for column in numeric_fields:
    series = data[column]
    centered = series - series.mean()
    data[column] = centered / series.std()
alt.Chart(data, width=500).transform_window(
index='count()'
).transform_fold(
['bill_depth_mm', 'flipper_length_mm', 'bill_length_mm', 'body_mass_g']
).mark_line().encode(
x='key:N',
y='value:Q',
color='species:N',
detail='index:N',
opacity=alt.value(0.5)
)import altair as alt
from vega_datasets import data
source = data.iris()
alt.Chart(source, width=500).transform_window(
index='count()'
).transform_fold(
['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth']
).mark_line().encode(
x='key:N',
y='value:Q',
color='species:N',
detail='index:N',
opacity=alt.value(0.5)
)Tiles and raster marks
from vega_datasets import data
data = data.windvectors()data| longitude | latitude | dir | dirCat | speed | |
|---|---|---|---|---|---|
| 0 | 0.125 | 45.125 | 228 | 225 | 3.12 |
| 1 | 0.375 | 45.125 | 228 | 225 | 3.24 |
| 2 | 0.625 | 45.125 | 229 | 225 | 3.34 |
| 3 | 0.875 | 45.125 | 229 | 225 | 3.44 |
| 4 | 1.125 | 45.125 | 228 | 225 | 3.48 |
| ... | ... | ... | ... | ... | ... |
| 4795 | -1.125 | 59.875 | 155 | 150 | 5.96 |
| 4796 | -0.875 | 59.875 | 154 | 150 | 6.34 |
| 4797 | -0.625 | 59.875 | 153 | 150 | 6.71 |
| 4798 | -0.375 | 59.875 | 152 | 150 | 7.09 |
| 4799 | -0.125 | 59.875 | 152 | 150 | 7.48 |
4800 rows × 5 columns
ggplot() + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed')) + ggsize(1000, 600)ggplot() + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed')) + scale_color_brewer('div', palette='RdBu', direction=-1) + ggsize(1000, 600)ggplot() + geom_tile(data=data, mapping=aes(x='longitude', y='latitude', fill='speed')) + scale_fill_brewer('div', palette='RdBu', direction=-1) + ggsize(1000, 600)ggplot() + geom_raster(data=data, mapping=aes(x='longitude', y='latitude', fill='speed')) + scale_fill_brewer('div', palette='RdBu', direction=-1) + ggsize(1000, 600)2-D Histograms
from vega_datasets import data
data = data.movies()ggplot() + geom_point(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre', label='Name')) + ggsize(1000, 600)ggplot() + geom_point(
data=data[['IMDB_Rating', 'Rotten_Tomatoes_Rating']],
mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), color='lightgrey'
) + geom_point(
data=data,
mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre', label='Name')
) + facet_wrap('Major_Genre') + ggsize(1600, 1000)ggplot() + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20])ggplot() + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20]) \
+ coord_cartesian()ggplot() + geom_bin2d(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'), bins=[20, 20]) \
+ coord_cartesian() + scale_fill_brewer('div', palette='Spectral', direction=-1) data| Title | US_Gross | Worldwide_Gross | US_DVD_Sales | Production_Budget | Release_Date | MPAA_Rating | Running_Time_min | Distributor | Source | Major_Genre | Creative_Type | Director | Rotten_Tomatoes_Rating | IMDB_Rating | IMDB_Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Land Girls | 146083.0 | 146083.0 | NaN | 8000000.0 | Jun 12 1998 | R | NaN | Gramercy | None | None | None | None | NaN | 6.1 | 1071.0 |
| 1 | First Love, Last Rites | 10876.0 | 10876.0 | NaN | 300000.0 | Aug 07 1998 | R | NaN | Strand | None | Drama | None | None | NaN | 6.9 | 207.0 |
| 2 | I Married a Strange Person | 203134.0 | 203134.0 | NaN | 250000.0 | Aug 28 1998 | None | NaN | Lionsgate | None | Comedy | None | None | NaN | 6.8 | 865.0 |
| 3 | Let's Talk About Sex | 373615.0 | 373615.0 | NaN | 300000.0 | Sep 11 1998 | None | NaN | Fine Line | None | Comedy | None | None | 13.0 | NaN | NaN |
| 4 | Slam | 1009819.0 | 1087521.0 | NaN | 1000000.0 | Oct 09 1998 | R | NaN | Trimark | Original Screenplay | Drama | Contemporary Fiction | None | 62.0 | 3.4 | 165.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3196 | Zack and Miri Make a Porno | 31452765.0 | 36851125.0 | 21240321.0 | 24000000.0 | Oct 31 2008 | R | 101.0 | Weinstein Co. | Original Screenplay | Comedy | Contemporary Fiction | Kevin Smith | 65.0 | 7.0 | 55687.0 |
| 3197 | Zodiac | 33080084.0 | 83080084.0 | 20983030.0 | 85000000.0 | Mar 02 2007 | R | 157.0 | Paramount Pictures | Based on Book/Short Story | Thriller/Suspense | Dramatization | David Fincher | 89.0 | NaN | NaN |
| 3198 | Zoom | 11989328.0 | 12506188.0 | 6679409.0 | 35000000.0 | Aug 11 2006 | PG | NaN | Sony Pictures | Based on Comic/Graphic Novel | Adventure | Super Hero | Peter Hewitt | 3.0 | 3.4 | 7424.0 |
| 3199 | The Legend of Zorro | 45575336.0 | 141475336.0 | NaN | 80000000.0 | Oct 28 2005 | PG | 129.0 | Sony Pictures | Remake | Adventure | Historical Fiction | Martin Campbell | 26.0 | 5.7 | 21161.0 |
| 3200 | The Mask of Zorro | 93828745.0 | 233700000.0 | NaN | 65000000.0 | Jul 17 1998 | PG-13 | 136.0 | Sony Pictures | Remake | Adventure | Historical Fiction | Martin Campbell | 82.0 | 6.7 | 4789.0 |
3201 rows × 16 columns
def density(data, x, y, bw=(0.25, 2.5)):
    """Return an unnormalized Gaussian-style kernel density at ``(x, y)``.

    For every row of *data* the squared, bandwidth-scaled distance to
    ``(x, y)`` is computed per axis, summed, passed through ``exp(-d)``,
    and the resulting kernel values are averaged.

    Args:
        data: Object indexable by ``'IMDB_Rating'`` and
            ``'Rotten_Tomatoes_Rating'`` (e.g. a pandas DataFrame).
        x: Query coordinate on the ``IMDB_Rating`` axis.
        y: Query coordinate on the ``Rotten_Tomatoes_Rating`` axis.
        bw: Per-axis bandwidths; ``bw[0]`` scales the x distance and
            ``bw[1]`` the y distance. The default is an immutable tuple so
            it cannot be modified between calls.

    Returns:
        The mean kernel value over the rows of *data*.
    """
    x_dist = ((x - data['IMDB_Rating']) / bw[0]) ** 2
    y_dist = ((y - data['Rotten_Tomatoes_Rating']) / bw[1]) ** 2
    # NOTE(review): presumably the columns are pandas Series, in which case
    # .mean() skips rows where either rating is NaN -- confirm with callers.
    return np.exp(-(x_dist + y_dist)).mean()
# Evaluate the hand-rolled kernel density on a regular 200 x 200 grid
# covering 0-10 on the x axis and 0-100 on the y axis.
x_coords, y_coords = np.meshgrid(np.linspace(0, 10, 200), np.linspace(0, 100, 200))
xy = np.stack([x_coords.flatten(), y_coords.flatten()]).T
# The grid gets its own name: the geom_density2df plots below still read the
# rating columns from `data`, so `data` must not be overwritten here.
density_grid = pd.DataFrame(dict(
    x=xy[:, 0],
    y=xy[:, 1],
    density=[density(data, xi, yi) for (xi, yi) in xy]
))

# Raster rendering of the manually computed density grid.
(
    ggplot()
    + geom_raster(data=density_grid, mapping=aes(x='x', y='y', fill='density'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
    + coord_cartesian()
)

# Built-in 2-D density estimate with the default bandwidth.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'), bins=20, show_legend=True)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

# Same estimate with the bandwidths used by density() above.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'), bw=[0.25, 2.5], bins=20, show_legend=True)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

# Variant that fills by the computed group instead of the density level.
(
    ggplot()
    + geom_density2df(data=data, mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..group..'), show_legend=False)
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

# Voronoi diagrams
from vega_datasets import data
airports = data.airports()
flights = data.flights_airport()airports| iata | name | city | state | country | latitude | longitude | |
|---|---|---|---|---|---|---|---|
| 0 | 00M | Thigpen | Bay Springs | MS | USA | 31.953765 | -89.234505 |
| 1 | 00R | Livingston Municipal | Livingston | TX | USA | 30.685861 | -95.017928 |
| 2 | 00V | Meadow Lake | Colorado Springs | CO | USA | 38.945749 | -104.569893 |
| 3 | 01G | Perry-Warsaw | Perry | NY | USA | 42.741347 | -78.052081 |
| 4 | 01J | Hilliard Airpark | Hilliard | FL | USA | 30.688012 | -81.905944 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3371 | ZEF | Elkin Municipal | Elkin | NC | USA | 36.280024 | -80.786069 |
| 3372 | ZER | Schuylkill Cty/Joe Zerbey | Pottsville | PA | USA | 40.706449 | -76.373147 |
| 3373 | ZPH | Zephyrhills Municipal | Zephyrhills | FL | USA | 28.228065 | -82.155916 |
| 3374 | ZUN | Black Rock | Zuni | NM | USA | 35.083227 | -108.791777 |
| 3375 | ZZV | Zanesville Municipal | Zanesville | OH | USA | 39.944458 | -81.892105 |
3376 rows × 7 columns
# Scatter the airport locations. At this point `data` is the object imported
# from vega_datasets, not a table, so the airports frame is passed explicitly.
ggplot() + geom_point(data=airports, mapping=aes(x='longitude', y='latitude'))

# Join every route to the airport row matching its origin code.
counts = flights.merge(airports, left_on='origin', right_on='iata')
counts| origin | destination | count | iata | name | city | state | country | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABE | ATL | 853 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
| 1 | ABE | BHM | 1 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
| 2 | ABE | CLE | 805 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
| 3 | ABE | CLT | 465 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
| 4 | ABE | CVG | 247 | ABE | Lehigh Valley International | Allentown | PA | USA | 40.652363 | -75.440402 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5361 | YUM | IPL | 326 | YUM | Yuma MCAS-Yuma International | Yuma | AZ | USA | 32.656583 | -114.605972 |
| 5362 | YUM | LAS | 99 | YUM | Yuma MCAS-Yuma International | Yuma | AZ | USA | 32.656583 | -114.605972 |
| 5363 | YUM | LAX | 1044 | YUM | Yuma MCAS-Yuma International | Yuma | AZ | USA | 32.656583 | -114.605972 |
| 5364 | YUM | PHX | 1961 | YUM | Yuma MCAS-Yuma International | Yuma | AZ | USA | 32.656583 | -114.605972 |
| 5365 | YUM | SLC | 440 | YUM | Yuma MCAS-Yuma International | Yuma | AZ | USA | 32.656583 | -114.605972 |
5366 rows × 10 columns
# Collapse the per-route rows into one row per origin: flight counts are
# summed, and the first latitude/longitude seen for the origin is kept.
per_origin_aggregation = {
    'count': 'sum',
    'latitude': 'first',
    'longitude': 'first',
}
counts = (
    counts[['origin', 'count', 'latitude', 'longitude']]
    .groupby('origin')
    .agg(per_origin_aggregation)
)
counts| count | latitude | longitude | |
|---|---|---|---|
| origin | |||
| ABE | 4807 | 40.652363 | -75.440402 |
| ABI | 2660 | 32.411320 | -99.681897 |
| ABQ | 41146 | 35.040222 | -106.609194 |
| ABY | 1095 | 31.535515 | -84.194473 |
| ACK | 457 | 41.253052 | -70.060181 |
| ... | ... | ... | ... |
| WYS | 264 | 44.688399 | -111.117638 |
| XNA | 14112 | 36.281869 | -94.306811 |
| YAK | 725 | 59.503361 | -139.660226 |
| YKM | 340 | 46.568170 | -120.544059 |
| YUM | 3871 | 32.656583 | -114.605972 |
303 rows × 3 columns
counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)')| count | latitude | longitude | |
|---|---|---|---|
| origin | |||
| ABE | 4807 | 40.652363 | -75.440402 |
| ABI | 2660 | 32.411320 | -99.681897 |
| ABQ | 41146 | 35.040222 | -106.609194 |
| ACV | 3714 | 40.978115 | -124.108619 |
| AEX | 2330 | 31.327372 | -92.548556 |
| ... | ... | ... | ... |
| TVC | 4262 | 44.741445 | -85.582235 |
| TYS | 14004 | 35.812487 | -83.992856 |
| VPS | 6820 | 30.483250 | -86.525400 |
| XNA | 14112 | 36.281869 | -94.306811 |
| YUM | 3871 | 32.656583 | -114.605972 |
194 rows × 3 columns
ggplot() + geom_point(data=counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)'), mapping=aes(x='longitude', y='latitude'))from scipy.spatial import Voronoi
# Tessellate the airport coordinates and flatten the usable regions into
# (region id, x, y) rows, one row per polygon vertex.
v = Voronoi(points=counts[['longitude', 'latitude']].values)
# Keep a region only if any(region) is true (this drops empty regions, and
# would also drop a region whose only vertex index is 0) and it contains no
# -1 index (scipy's marker for a vertex outside the diagram).
bounded_regions = [
    (region_id, region)
    for region_id, region in enumerate(v.regions)
    if any(region) and -1 not in region
]
v_df = pd.DataFrame(
    [
        (region_id, *v.vertices[vertex_id])
        for region_id, region in bounded_regions
        for vertex_id in region
    ],
    columns=['id', 'x', 'y'],
)
ggplot() + \
geom_polygon(aes(x='x', y='y', group='id', fill='id'), \
data=v_df, show_legend=False, color='black', alpha=0.) + \
geom_point(aes(x='longitude', y='latitude'), data=counts, shape=21, color='black', fill='white') + \
scale_fill_discrete() + scale_x_continuous(limits=[-125, -70]) + scale_y_continuous(limits=[25, 50]) + ggsize(1200, 800)v.vertices[v.ridge_vertices].shape(897, 2, 2)
v.vertices[0]array([-122.52061032, -13.33427622])
from scipy.spatial import Voronoi

# Recompute the tessellation and describe each usable ridge as a line segment
# (x1, y1) -> (x2, y2) between its two Voronoi vertices.
v = Voronoi(points=counts[['longitude', 'latitude']].values)
# Same filter as for regions: any(ridge) must be true and the ridge must not
# reference the -1 index (scipy's marker for a vertex outside the diagram).
finite_ridges = [
    ridge for ridge in v.ridge_vertices
    if any(ridge) and -1 not in ridge
]
v_df = pd.DataFrame(
    [
        v.vertices[ridge[0]].tolist() + v.vertices[ridge[1]].tolist()
        for ridge in finite_ridges
    ],
    columns=['x1', 'y1', 'x2', 'y2'],
)
ggplot() + \
geom_segment(aes(x='x1', y='y1', xend='x2', yend='y2',), \
data=v_df, show_legend=False, color='black') + \
geom_point(aes(x='longitude', y='latitude'), data=counts, shape=21, color='black', fill='white') + \
scale_fill_discrete() + scale_x_continuous(limits=[-125, -70]) + scale_y_continuous(limits=[25, 50]) + ggsize(1200, 800)v_df| x1 | y1 | x2 | y2 | |
|---|---|---|---|---|
| 0 | -162.443216 | 63.804203 | -157.848949 | 64.376560 |
| 1 | -162.443216 | 63.804203 | -169.585957 | 56.901014 |
| 2 | -157.848949 | 64.376560 | -165.377812 | 49.900827 |
| 3 | -169.585957 | 56.901014 | -165.377812 | 49.900827 |
| 4 | -156.615711 | 65.020800 | -157.848949 | 64.376560 |
| ... | ... | ... | ... | ... |
| 883 | -90.176172 | 43.823228 | -89.330875 | 43.993172 |
| 884 | -88.647938 | 42.812487 | -88.565038 | 43.432585 |
| 885 | -91.127222 | 45.568721 | -88.856878 | 45.038552 |
| 886 | -88.306081 | 45.632959 | -88.755175 | 45.108732 |
| 887 | -87.172633 | 43.843489 | -87.947783 | 43.726042 |
888 rows × 4 columns
# Overlay the Voronoi ridge segments on the airports with more than 2000
# flights between latitudes 25 and 50. geom_segment has no data of its own to
# inherit from ggplot() here, so it is given the ridge table and the four
# endpoint aesthetics explicitly.
# NOTE(review): assumes v_df still holds the x1/y1/x2/y2 ridge table built
# earlier in this file -- confirm this is the intended overlay.
busy_airports = counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)')
(
    ggplot()
    + geom_point(data=busy_airports, mapping=aes(x='longitude', y='latitude'))
    + geom_segment(data=v_df, mapping=aes(x='x1', y='y1', xend='x2', yend='y2'), color='black')
)